import pandas as pd # required libraries imported
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
# Load the Google Play Store apps dataset from a hard-coded local path.
# NOTE(review): an absolute Windows path is not portable — consider a
# relative path or pathlib.Path for reproducibility.
df=pd.read_csv("C:\\SZ\\GLCA DA PROJECTS\\New folder\\Apps_data+(1).csv")
df # dataset is read; display it for a first look
| App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Photo Editor & Candy Camera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19M | 10,000+ | Free | 0 | Everyone | Art & Design | January 7, 2018 | 1.0.0 | 4.0.3 and up |
| 1 | Coloring book moana | ART_AND_DESIGN | 3.9 | 967 | 14M | 500,000+ | Free | 0 | Everyone | Art & Design;Pretend Play | January 15, 2018 | 2.0.0 | 4.0.3 and up |
| 2 | U Launcher Lite – FREE Live Cool Themes, Hide ... | ART_AND_DESIGN | 4.7 | 87510 | 8.7M | 5,000,000+ | Free | 0 | Everyone | Art & Design | August 1, 2018 | 1.2.4 | 4.0.3 and up |
| 3 | Sketch - Draw & Paint | ART_AND_DESIGN | 4.5 | 215644 | 25M | 50,000,000+ | Free | 0 | Teen | Art & Design | June 8, 2018 | Varies with device | 4.2 and up |
| 4 | Pixel Draw - Number Art Coloring Book | ART_AND_DESIGN | 4.3 | 967 | 2.8M | 100,000+ | Free | 0 | Everyone | Art & Design;Creativity | June 20, 2018 | 1.1 | 4.4 and up |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10836 | Sya9a Maroc - FR | FAMILY | 4.5 | 38 | 53M | 5,000+ | Free | 0 | Everyone | Education | July 25, 2017 | 1.48 | 4.1 and up |
| 10837 | Fr. Mike Schmitz Audio Teachings | FAMILY | 5.0 | 4 | 3.6M | 100+ | Free | 0 | Everyone | Education | July 6, 2018 | 1.0 | 4.1 and up |
| 10838 | Parkinson Exercices FR | MEDICAL | NaN | 3 | 9.5M | 1,000+ | Free | 0 | Everyone | Medical | January 20, 2017 | 1.0 | 2.2 and up |
| 10839 | The SCP Foundation DB fr nn5n | BOOKS_AND_REFERENCE | 4.5 | 114 | Varies with device | 1,000+ | Free | 0 | Mature 17+ | Books & Reference | January 19, 2015 | Varies with device | Varies with device |
| 10840 | iHoroscope - 2018 Daily Horoscope & Astrology | LIFESTYLE | 4.5 | 398307 | 19M | 10,000,000+ | Free | 0 | Everyone | Lifestyle | July 25, 2018 | Varies with device | Varies with device |
10841 rows × 13 columns
df.sample(5) # random few samples checked
| App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7253 | CF Calculator | LIFESTYLE | 3.3 | 25 | 4.7M | 1,000+ | Free | 0 | Everyone | Lifestyle | March 10, 2015 | 1.0.2 | 4.0 and up |
| 5072 | AF Johannesburg | FAMILY | NaN | 3 | 11M | 100+ | Free | 0 | Everyone | Education | May 2, 2017 | 1.0.3 | 4.1 and up |
| 7844 | Map Callouts for CS:GO | GAME | 4.3 | 211 | 3.6M | 50,000+ | Free | 0 | Everyone | Action | January 26, 2018 | 2.1 | 4.0.3 and up |
| 9114 | My Ooredoo Algeria | TOOLS | 4.2 | 3606 | 17M | 100,000+ | Free | 0 | Everyone | Tools | June 24, 2018 | 1.22.1 | 4.4 and up |
| 5322 | Al-Moazin Lite (Prayer Times) | LIFESTYLE | 4.6 | 284670 | Varies with device | 10,000,000+ | Free | 0 | Everyone | Lifestyle | July 24, 2018 | Varies with device | Varies with device |
df.shape # shape checked indicating presence of 10841 rows and 13 columns
(10841, 13)
df.info() # we can infere that only one numerical column is present and all other are categorical and also there are 13 columns and 10841 rows
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10841 entries, 0 to 10840 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 App 10841 non-null object 1 Category 10841 non-null object 2 Rating 9367 non-null float64 3 Reviews 10841 non-null object 4 Size 10841 non-null object 5 Installs 10841 non-null object 6 Type 10840 non-null object 7 Price 10841 non-null object 8 Content Rating 10840 non-null object 9 Genres 10841 non-null object 10 Last Updated 10841 non-null object 11 Current Ver 10833 non-null object 12 Android Ver 10838 non-null object dtypes: float64(1), object(12) memory usage: 1.1+ MB
df.describe(include='all') # summary stats checked
| App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10841 | 10841 | 9367.000000 | 10841 | 10841 | 10841 | 10840 | 10841 | 10840 | 10841 | 10841 | 10833 | 10838 |
| unique | 9660 | 34 | NaN | 6002 | 462 | 22 | 3 | 93 | 6 | 120 | 1378 | 2832 | 33 |
| top | ROBLOX | FAMILY | NaN | 0 | Varies with device | 1,000,000+ | Free | 0 | Everyone | Tools | August 3, 2018 | Varies with device | 4.1 and up |
| freq | 9 | 1972 | NaN | 596 | 1695 | 1579 | 10039 | 10040 | 8714 | 842 | 326 | 1459 | 2451 |
| mean | NaN | NaN | 4.193338 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| std | NaN | NaN | 0.537431 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| min | NaN | NaN | 1.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 25% | NaN | NaN | 4.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 50% | NaN | NaN | 4.300000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 75% | NaN | NaN | 4.500000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| max | NaN | NaN | 19.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
df.select_dtypes(include='object').columns.tolist() # categories columns ,need to be worked on before model building
['App', 'Category', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
df.columns[df.isna().any()].tolist() # Nan columns , need to be workded on before model building
['Rating', 'Type', 'Content Rating', 'Current Ver', 'Android Ver']
df1=df.copy() # for a safe move
df[df.duplicated()] #483 rows with duplicates
| App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 229 | Quick PDF Scanner + OCR FREE | BUSINESS | 4.2 | 80805 | Varies with device | 5,000,000+ | Free | 0 | Everyone | Business | February 26, 2018 | Varies with device | 4.0.3 and up |
| 236 | Box | BUSINESS | 4.2 | 159872 | Varies with device | 10,000,000+ | Free | 0 | Everyone | Business | July 31, 2018 | Varies with device | Varies with device |
| 239 | Google My Business | BUSINESS | 4.4 | 70991 | Varies with device | 5,000,000+ | Free | 0 | Everyone | Business | July 24, 2018 | 2.19.0.204537701 | 4.4 and up |
| 256 | ZOOM Cloud Meetings | BUSINESS | 4.4 | 31614 | 37M | 10,000,000+ | Free | 0 | Everyone | Business | July 20, 2018 | 4.1.28165.0716 | 4.0 and up |
| 261 | join.me - Simple Meetings | BUSINESS | 4.0 | 6989 | Varies with device | 1,000,000+ | Free | 0 | Everyone | Business | July 16, 2018 | 4.3.0.508 | 4.4 and up |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8643 | Wunderlist: To-Do List & Tasks | PRODUCTIVITY | 4.6 | 404610 | Varies with device | 10,000,000+ | Free | 0 | Everyone | Productivity | April 6, 2018 | Varies with device | Varies with device |
| 8654 | TickTick: To Do List with Reminder, Day Planner | PRODUCTIVITY | 4.6 | 25370 | Varies with device | 1,000,000+ | Free | 0 | Everyone | Productivity | August 6, 2018 | Varies with device | Varies with device |
| 8658 | ColorNote Notepad Notes | PRODUCTIVITY | 4.6 | 2401017 | Varies with device | 100,000,000+ | Free | 0 | Everyone | Productivity | June 27, 2018 | Varies with device | Varies with device |
| 10049 | Airway Ex - Intubate. Anesthetize. Train. | MEDICAL | 4.3 | 123 | 86M | 10,000+ | Free | 0 | Everyone | Medical | June 1, 2018 | 0.6.88 | 5.0 and up |
| 10768 | AAFP | MEDICAL | 3.8 | 63 | 24M | 10,000+ | Free | 0 | Everyone | Medical | June 22, 2018 | 2.3.1 | 5.0 and up |
483 rows × 13 columns
df.drop_duplicates(inplace=True) # duplicated rows dropped
df.shape # number of rows reduced from 10841 to 10358 .Therefore we can come to know that 483 rows were dropped
(10358, 13)
df['Category'].unique()
array(['ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY',
'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION',
'DATING', 'EDUCATION', 'ENTERTAINMENT', 'EVENTS', 'FINANCE',
'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME',
'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'GAME', 'FAMILY', 'MEDICAL',
'SOCIAL', 'SHOPPING', 'PHOTOGRAPHY', 'SPORTS', 'TRAVEL_AND_LOCAL',
'TOOLS', 'PERSONALIZATION', 'PRODUCTIVITY', 'PARENTING', 'WEATHER',
'VIDEO_PLAYERS', 'NEWS_AND_MAGAZINES', 'MAPS_AND_NAVIGATION',
'1.9'], dtype=object)
# '1.9' is not a valid app category — it belongs to a malformed row
# (its columns are shifted, see the NaNs in the displayed record below).
invalid = df[df["Category"] == "1.9"]
invalid  # inspect the offending row before dropping it
| App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10472 | Life Made WI-Fi Touchscreen Photo Frame | 1.9 | 19.0 | 3.0M | 1,000+ | Free | 0 | Everyone | NaN | February 11, 2018 | 1.0.19 | 4.0 and up | NaN |
df.drop(10472,inplace=True) # droping the invalid value row corresponding to the 'Category' column
df.shape # one row got reduced
(10357, 13)
df[df['Rating'].isnull()==True].index # index of the rows with missing rating
Int64Index([ 23, 113, 123, 126, 129, 130, 134, 163, 180,
185,
...
10816, 10818, 10821, 10822, 10823, 10824, 10825, 10831, 10835,
10838],
dtype='int64', length=1465)
df.drop(df[df['Rating'].isnull()==True].index,inplace=True) # Dropping off records with missing rating
df.shape # The Data after dropping the rows having null Ratings consists of 8892 rows and 13 columns
(8892, 13)
# Defining a fuction to classify the ratings.
def Rating_category(value):
    """Map a numeric app rating to a binary label.

    Returns "High" for ratings above 3.5 and "Low" for ratings at or
    below 3.5. A NaN input fails both comparisons and falls through to
    an implicit None — matching the original behavior (NaN ratings were
    already dropped upstream, so this path is not hit here).
    """
    if value > 3.5:
        return "High"
    if value <= 3.5:
        return "Low"
df["Rating_category"] = df['Rating'].apply(Rating_category) # Creating the column Rating_category and classifying records using apply fucntion
df
| App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | Rating_category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Photo Editor & Candy Camera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19M | 10,000+ | Free | 0 | Everyone | Art & Design | January 7, 2018 | 1.0.0 | 4.0.3 and up | High |
| 1 | Coloring book moana | ART_AND_DESIGN | 3.9 | 967 | 14M | 500,000+ | Free | 0 | Everyone | Art & Design;Pretend Play | January 15, 2018 | 2.0.0 | 4.0.3 and up | High |
| 2 | U Launcher Lite – FREE Live Cool Themes, Hide ... | ART_AND_DESIGN | 4.7 | 87510 | 8.7M | 5,000,000+ | Free | 0 | Everyone | Art & Design | August 1, 2018 | 1.2.4 | 4.0.3 and up | High |
| 3 | Sketch - Draw & Paint | ART_AND_DESIGN | 4.5 | 215644 | 25M | 50,000,000+ | Free | 0 | Teen | Art & Design | June 8, 2018 | Varies with device | 4.2 and up | High |
| 4 | Pixel Draw - Number Art Coloring Book | ART_AND_DESIGN | 4.3 | 967 | 2.8M | 100,000+ | Free | 0 | Everyone | Art & Design;Creativity | June 20, 2018 | 1.1 | 4.4 and up | High |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10834 | FR Calculator | FAMILY | 4.0 | 7 | 2.6M | 500+ | Free | 0 | Everyone | Education | June 18, 2017 | 1.0.0 | 4.1 and up | High |
| 10836 | Sya9a Maroc - FR | FAMILY | 4.5 | 38 | 53M | 5,000+ | Free | 0 | Everyone | Education | July 25, 2017 | 1.48 | 4.1 and up | High |
| 10837 | Fr. Mike Schmitz Audio Teachings | FAMILY | 5.0 | 4 | 3.6M | 100+ | Free | 0 | Everyone | Education | July 6, 2018 | 1.0 | 4.1 and up | High |
| 10839 | The SCP Foundation DB fr nn5n | BOOKS_AND_REFERENCE | 4.5 | 114 | Varies with device | 1,000+ | Free | 0 | Mature 17+ | Books & Reference | January 19, 2015 | Varies with device | Varies with device | High |
| 10840 | iHoroscope - 2018 Daily Horoscope & Astrology | LIFESTYLE | 4.5 | 398307 | 19M | 10,000,000+ | Free | 0 | Everyone | Lifestyle | July 25, 2018 | Varies with device | Varies with device | High |
8892 rows × 14 columns
#Distribution of Rating_category
distribution = df["Rating_category"].value_counts()
distribution
High 8012 Low 880 Name: Rating_category, dtype: int64
#plotting a histogram to see the distribution
df["Rating_category"].hist()
<Axes: >
Most apps are rated above 3.5; only a small minority have a rating of 3.5 or below.
# Reviews arrives as strings of digits — cast to a fixed-width integer.
# Use 'int64' rather than bare 'int': plain 'int' maps to the platform C
# long (int32 on Windows, as the original int32 output showed), so an
# explicit width keeps results consistent across machines.
df['Reviews'] = df['Reviews'].astype('int64')
df['Reviews'].dtypes # confirm the conversion
dtype('int32')
import plotly.express as px
fig = px.box(df['Reviews'])
fig.show() # box plot before outlier removal. The dots shows that outliers are present
# Applying Log Transformation to the column using Log10.
log10 = np.log10(df["Reviews"])
# Checking The Descriptive Statistics after applying Log Transformation.
log10.describe()
count 8892.000000 mean 3.576368 std 1.685149 min 0.000000 25% 2.214844 50% 3.673435 75% 4.852887 max 7.892975 Name: Reviews, dtype: float64
fig = px.box(log10)
# Box plot of the log10-transformed review counts. NOTE(review): no
# outliers were actually removed — the log transform compresses the
# scale so extreme review counts no longer show up as outlier points.
fig.show()
# Replacing reviews values with their transformed values.
df["Reviews"] = log10
df['Size']
0 19M
1 14M
2 8.7M
3 25M
4 2.8M
...
10834 2.6M
10836 53M
10837 3.6M
10839 Varies with device
10840 19M
Name: Size, Length: 8892, dtype: object
# Clean the Size column: drop thousands separators and convert the
# "M" (megabyte) / "k" (kilobyte) suffixes into numeric values. The
# result is kept as strings so the "Varies with device" sentinel
# survives for the filtering step that follows.
# BUG FIX: the previous str.replace("M", "000000") corrupted fractional
# sizes — "8.7M" became "8.7000000", which parses as 8.7 instead of
# 8,700,000 — so the suffix is now applied as a numeric multiplier.
def _size_to_number(text):
    # One raw Size cell -> numeric string, or the original text when
    # it carries no recognised suffix (e.g. "Varies with device").
    text = text.replace(",", "")
    if text.endswith("M"):
        return str(float(text[:-1]) * 1_000_000)
    if text.endswith("k"):
        return str(float(text[:-1]) * 1_000)
    return text

df["Size"] = df["Size"].apply(_size_to_number)
df[df['Size']=='Varies with device'].index #finding index of the rows with 'Size' value as 'Varies with device'
Int64Index([ 37, 42, 52, 67, 68, 73, 85, 88, 89,
92,
...
10647, 10679, 10681, 10707, 10712, 10713, 10725, 10765, 10826,
10839],
dtype='int64', length=1468)
df.drop(df[df['Size']=='Varies with device'].index,inplace=True)
df.shape
(7424, 14)
# Conversion of column named SIZE from Object dtype to the best Suitable dtype.
df["Size"].convert_dtypes()
0 19000000
1 14000000
2 8.7000000
3 25000000
4 2.8000000
...
10833 619000
10834 2.6000000
10836 53000000
10837 3.6000000
10840 19000000
Name: Size, Length: 7424, dtype: string
df['Installs'] # the unwanted charecters are '+',",". So, we need to remove them
0 10,000+
1 500,000+
2 5,000,000+
3 50,000,000+
4 100,000+
...
10833 1,000+
10834 500+
10836 5,000+
10837 100+
10840 10,000,000+
Name: Installs, Length: 7424, dtype: object
# Strip the trailing "+" from Installs. regex=False makes pandas treat
# "+" as a literal character — "+" is a regex quantifier, and omitting
# the flag triggered the FutureWarning about single-character patterns.
df["Installs"] = df["Installs"].str.replace("+", "", regex=False)
C:\Users\schwa_0iywr9u\AppData\Local\Temp\ipykernel_20912\2004903034.py:2: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
df["Installs"]
0 10,000
1 500,000
2 5,000,000
3 50,000,000
4 100,000
...
10833 1,000
10834 500
10836 5,000
10837 100
10840 10,000,000
Name: Installs, Length: 7424, dtype: object
# Conversion of column named Installs from Object dtype to the best Suitable dtype.
df["Installs"].convert_dtypes()
0 10,000
1 500,000
2 5,000,000
3 50,000,000
4 100,000
...
10833 1,000
10834 500
10836 5,000
10837 100
10840 10,000,000
Name: Installs, Length: 7424, dtype: string
df.head()
| App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | Rating_category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Photo Editor & Candy Camera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 2.201397 | 19000000 | 10,000 | Free | 0 | Everyone | Art & Design | January 7, 2018 | 1.0.0 | 4.0.3 and up | High |
| 1 | Coloring book moana | ART_AND_DESIGN | 3.9 | 2.985426 | 14000000 | 500,000 | Free | 0 | Everyone | Art & Design;Pretend Play | January 15, 2018 | 2.0.0 | 4.0.3 and up | High |
| 2 | U Launcher Lite – FREE Live Cool Themes, Hide ... | ART_AND_DESIGN | 4.7 | 4.942058 | 8.7000000 | 5,000,000 | Free | 0 | Everyone | Art & Design | August 1, 2018 | 1.2.4 | 4.0.3 and up | High |
| 3 | Sketch - Draw & Paint | ART_AND_DESIGN | 4.5 | 5.333737 | 25000000 | 50,000,000 | Free | 0 | Teen | Art & Design | June 8, 2018 | Varies with device | 4.2 and up | High |
| 4 | Pixel Draw - Number Art Coloring Book | ART_AND_DESIGN | 4.3 | 2.985426 | 2.8000000 | 100,000 | Free | 0 | Everyone | Art & Design;Creativity | June 20, 2018 | 1.1 | 4.4 and up | High |
df["Price"]
0 0
1 0
2 0
3 0
4 0
..
10833 0
10834 0
10836 0
10837 0
10840 0
Name: Price, Length: 7424, dtype: object
df["Price"].unique()
array(['0', '$4.99', '$6.99', '$7.99', '$3.99', '$5.99', '$2.99', '$1.99',
'$9.99', '$0.99', '$9.00', '$5.49', '$10.00', '$24.99', '$11.99',
'$79.99', '$16.99', '$14.99', '$29.99', '$12.99', '$3.49',
'$10.99', '$7.49', '$1.50', '$19.99', '$15.99', '$33.99', '$39.99',
'$2.49', '$4.49', '$1.70', '$1.49', '$3.88', '$399.99', '$17.99',
'$400.00', '$3.02', '$1.76', '$4.84', '$4.77', '$1.61', '$1.59',
'$6.49', '$1.29', '$299.99', '$379.99', '$37.99', '$18.99',
'$389.99', '$8.49', '$1.75', '$14.00', '$2.00', '$3.08', '$2.59',
'$19.40', '$15.46', '$8.99', '$3.04', '$13.99', '$4.29', '$3.28',
'$4.60', '$1.00', '$2.90', '$1.97', '$2.56', '$1.20'], dtype=object)
# Remove the literal "$" currency sign from Price. regex=False is
# essential here: as a regex, "$" anchors end-of-string and would match
# nothing to replace (it also silences the pandas FutureWarning).
df["Price"] = df["Price"].str.replace("$", "", regex=False)
C:\Users\schwa_0iywr9u\AppData\Local\Temp\ipykernel_20912\3914899130.py:2: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
df["Price"].unique()
array(['0', '4.99', '6.99', '7.99', '3.99', '5.99', '2.99', '1.99',
'9.99', '0.99', '9.00', '5.49', '10.00', '24.99', '11.99', '79.99',
'16.99', '14.99', '29.99', '12.99', '3.49', '10.99', '7.49',
'1.50', '19.99', '15.99', '33.99', '39.99', '2.49', '4.49', '1.70',
'1.49', '3.88', '399.99', '17.99', '400.00', '3.02', '1.76',
'4.84', '4.77', '1.61', '1.59', '6.49', '1.29', '299.99', '379.99',
'37.99', '18.99', '389.99', '8.49', '1.75', '14.00', '2.00',
'3.08', '2.59', '19.40', '15.46', '8.99', '3.04', '13.99', '4.29',
'3.28', '4.60', '1.00', '2.90', '1.97', '2.56', '1.20'],
dtype=object)
# Conversion of column named Price from Object dtype to the best Suitable dtype.
df["Price"].convert_dtypes()
0 0
1 0
2 0
3 0
4 0
..
10833 0
10834 0
10836 0
10837 0
10840 0
Name: Price, Length: 7424, dtype: string
#Getting all the columns of the dataset.
df.columns
Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
'Android Ver', 'Rating_category'],
dtype='object')
# Dropping the redundant columns from the datset.
df.drop(["App","Rating","Genres","Last Updated","Current Ver","Android Ver"], axis = 1,inplace = True)
df
| Category | Reviews | Size | Installs | Type | Price | Content Rating | Rating_category | |
|---|---|---|---|---|---|---|---|---|
| 0 | ART_AND_DESIGN | 2.201397 | 19000000 | 10,000 | Free | 0 | Everyone | High |
| 1 | ART_AND_DESIGN | 2.985426 | 14000000 | 500,000 | Free | 0 | Everyone | High |
| 2 | ART_AND_DESIGN | 4.942058 | 8.7000000 | 5,000,000 | Free | 0 | Everyone | High |
| 3 | ART_AND_DESIGN | 5.333737 | 25000000 | 50,000,000 | Free | 0 | Teen | High |
| 4 | ART_AND_DESIGN | 2.985426 | 2.8000000 | 100,000 | Free | 0 | Everyone | High |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10833 | BOOKS_AND_REFERENCE | 1.643453 | 619000 | 1,000 | Free | 0 | Everyone | High |
| 10834 | FAMILY | 0.845098 | 2.6000000 | 500 | Free | 0 | Everyone | High |
| 10836 | FAMILY | 1.579784 | 53000000 | 5,000 | Free | 0 | Everyone | High |
| 10837 | FAMILY | 0.602060 | 3.6000000 | 100 | Free | 0 | Everyone | High |
| 10840 | LIFESTYLE | 5.600218 | 19000000 | 10,000,000 | Free | 0 | Everyone | High |
7424 rows × 8 columns
df.dtypes
Category object Reviews float64 Size object Installs object Type object Price object Content Rating object Rating_category object dtype: object
1."Category"
# Getting count of Unique categories from Category column.
df.Category.nunique()
33
# Importing Label Encoder in order to perform ordinal encoding.
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
# Transforming the Category Column using Ordinal Encoding.
df["Category"] = enc.fit_transform(df[["Category"]])
df # column category encoded
| Category | Reviews | Size | Installs | Type | Price | Content Rating | Rating_category | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 2.201397 | 19000000 | 10,000 | Free | 0 | Everyone | High |
| 1 | 0.0 | 2.985426 | 14000000 | 500,000 | Free | 0 | Everyone | High |
| 2 | 0.0 | 4.942058 | 8.7000000 | 5,000,000 | Free | 0 | Everyone | High |
| 3 | 0.0 | 5.333737 | 25000000 | 50,000,000 | Free | 0 | Teen | High |
| 4 | 0.0 | 2.985426 | 2.8000000 | 100,000 | Free | 0 | Everyone | High |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10833 | 3.0 | 1.643453 | 619000 | 1,000 | Free | 0 | Everyone | High |
| 10834 | 11.0 | 0.845098 | 2.6000000 | 500 | Free | 0 | Everyone | High |
| 10836 | 11.0 | 1.579784 | 53000000 | 5,000 | Free | 0 | Everyone | High |
| 10837 | 11.0 | 0.602060 | 3.6000000 | 100 | Free | 0 | Everyone | High |
| 10840 | 18.0 | 5.600218 | 19000000 | 10,000,000 | Free | 0 | Everyone | High |
7424 rows × 8 columns
2."Content Rating"
# Getting Unique ratings from Content Rating column.
df["Content Rating"].unique()
array(['Everyone', 'Teen', 'Everyone 10+', 'Mature 17+',
'Adults only 18+', 'Unrated'], dtype=object)
# Transforming the Category Column using ordinal Encoding.
df["Content Rating"] = enc.fit_transform(df[["Content Rating"]])
df
| Category | Reviews | Size | Installs | Type | Price | Content Rating | Rating_category | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 2.201397 | 19000000 | 10,000 | Free | 0 | 1.0 | High |
| 1 | 0.0 | 2.985426 | 14000000 | 500,000 | Free | 0 | 1.0 | High |
| 2 | 0.0 | 4.942058 | 8.7000000 | 5,000,000 | Free | 0 | 1.0 | High |
| 3 | 0.0 | 5.333737 | 25000000 | 50,000,000 | Free | 0 | 4.0 | High |
| 4 | 0.0 | 2.985426 | 2.8000000 | 100,000 | Free | 0 | 1.0 | High |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10833 | 3.0 | 1.643453 | 619000 | 1,000 | Free | 0 | 1.0 | High |
| 10834 | 11.0 | 0.845098 | 2.6000000 | 500 | Free | 0 | 1.0 | High |
| 10836 | 11.0 | 1.579784 | 53000000 | 5,000 | Free | 0 | 1.0 | High |
| 10837 | 11.0 | 0.602060 | 3.6000000 | 100 | Free | 0 | 1.0 | High |
| 10840 | 18.0 | 5.600218 | 19000000 | 10,000,000 | Free | 0 | 1.0 | High |
7424 rows × 8 columns
3.Type
# Getting Unique types from Type column.
df["Type"].unique()
array(['Free', 'Paid'], dtype=object)
# Transforming the Type Column using ordinal Encoding.
df["Type"] = enc.fit_transform(df[["Type"]])
df
| Category | Reviews | Size | Installs | Type | Price | Content Rating | Rating_category | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 2.201397 | 19000000 | 10,000 | 0.0 | 0 | 1.0 | High |
| 1 | 0.0 | 2.985426 | 14000000 | 500,000 | 0.0 | 0 | 1.0 | High |
| 2 | 0.0 | 4.942058 | 8.7000000 | 5,000,000 | 0.0 | 0 | 1.0 | High |
| 3 | 0.0 | 5.333737 | 25000000 | 50,000,000 | 0.0 | 0 | 4.0 | High |
| 4 | 0.0 | 2.985426 | 2.8000000 | 100,000 | 0.0 | 0 | 1.0 | High |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10833 | 3.0 | 1.643453 | 619000 | 1,000 | 0.0 | 0 | 1.0 | High |
| 10834 | 11.0 | 0.845098 | 2.6000000 | 500 | 0.0 | 0 | 1.0 | High |
| 10836 | 11.0 | 1.579784 | 53000000 | 5,000 | 0.0 | 0 | 1.0 | High |
| 10837 | 11.0 | 0.602060 | 3.6000000 | 100 | 0.0 | 0 | 1.0 | High |
| 10840 | 18.0 | 5.600218 | 19000000 | 10,000,000 | 0.0 | 0 | 1.0 | High |
7424 rows × 8 columns
4.Rating Category
# Getting Unique categories from Rating_category column.
df["Rating_category"].unique()
array(['High', 'Low'], dtype=object)
# Transforming the Rating_category Column using ordinal Encoding.
df["Rating_category"] = enc.fit_transform(df[["Rating_category"]])
df
| Category | Reviews | Size | Installs | Type | Price | Content Rating | Rating_category | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 2.201397 | 19000000 | 10,000 | 0.0 | 0 | 1.0 | 0.0 |
| 1 | 0.0 | 2.985426 | 14000000 | 500,000 | 0.0 | 0 | 1.0 | 0.0 |
| 2 | 0.0 | 4.942058 | 8.7000000 | 5,000,000 | 0.0 | 0 | 1.0 | 0.0 |
| 3 | 0.0 | 5.333737 | 25000000 | 50,000,000 | 0.0 | 0 | 4.0 | 0.0 |
| 4 | 0.0 | 2.985426 | 2.8000000 | 100,000 | 0.0 | 0 | 1.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10833 | 3.0 | 1.643453 | 619000 | 1,000 | 0.0 | 0 | 1.0 | 0.0 |
| 10834 | 11.0 | 0.845098 | 2.6000000 | 500 | 0.0 | 0 | 1.0 | 0.0 |
| 10836 | 11.0 | 1.579784 | 53000000 | 5,000 | 0.0 | 0 | 1.0 | 0.0 |
| 10837 | 11.0 | 0.602060 | 3.6000000 | 100 | 0.0 | 0 | 1.0 | 0.0 |
| 10840 | 18.0 | 5.600218 | 19000000 | 10,000,000 | 0.0 | 0 | 1.0 | 0.0 |
7424 rows × 8 columns
Segregate the target and independent features (Hint: Use Rating_category as the target)
# Creating variable X with all Independent Variables.
X = df.drop("Rating_category", axis=1)
# Creating variable Y with Dependent Variable.
Y = df[["Rating_category"]]
X
| Category | Reviews | Size | Installs | Type | Price | Content Rating | |
|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 2.201397 | 19000000 | 10,000 | 0.0 | 0 | 1.0 |
| 1 | 0.0 | 2.985426 | 14000000 | 500,000 | 0.0 | 0 | 1.0 |
| 2 | 0.0 | 4.942058 | 8.7000000 | 5,000,000 | 0.0 | 0 | 1.0 |
| 3 | 0.0 | 5.333737 | 25000000 | 50,000,000 | 0.0 | 0 | 4.0 |
| 4 | 0.0 | 2.985426 | 2.8000000 | 100,000 | 0.0 | 0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 10833 | 3.0 | 1.643453 | 619000 | 1,000 | 0.0 | 0 | 1.0 |
| 10834 | 11.0 | 0.845098 | 2.6000000 | 500 | 0.0 | 0 | 1.0 |
| 10836 | 11.0 | 1.579784 | 53000000 | 5,000 | 0.0 | 0 | 1.0 |
| 10837 | 11.0 | 0.602060 | 3.6000000 | 100 | 0.0 | 0 | 1.0 |
| 10840 | 18.0 | 5.600218 | 19000000 | 10,000,000 | 0.0 | 0 | 1.0 |
7424 rows × 7 columns
Y
| Rating_category | |
|---|---|
| 0 | 0.0 |
| 1 | 0.0 |
| 2 | 0.0 |
| 3 | 0.0 |
| 4 | 0.0 |
| ... | ... |
| 10833 | 0.0 |
| 10834 | 0.0 |
| 10836 | 0.0 |
| 10837 | 0.0 |
| 10840 | 0.0 |
7424 rows × 1 columns
# importing library to splitting the dataset.
from sklearn.model_selection import train_test_split
#Spliting the data set into Train and Test.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30 , random_state=1)
X_train
| Category | Reviews | Size | Installs | Type | Price | Content Rating | |
|---|---|---|---|---|---|---|---|
| 8888 | 0.0 | 0.000000 | 2.9000000 | 100 | 0.0 | 0 | 1.0 |
| 1298 | 15.0 | 5.264019 | 57000000 | 5,000,000 | 0.0 | 0 | 1.0 |
| 6599 | 20.0 | 2.489958 | 5.9000000 | 10,000 | 0.0 | 0 | 1.0 |
| 7328 | 30.0 | 2.328380 | 6.1000000 | 5,000 | 1.0 | 3.99 | 1.0 |
| 5681 | 18.0 | 1.755875 | 5.8000000 | 10,000 | 0.0 | 0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1522 | 17.0 | 2.731589 | 99000000 | 50,000 | 0.0 | 0 | 1.0 |
| 7755 | 11.0 | 1.462398 | 37000000 | 1,000 | 0.0 | 0 | 2.0 |
| 5978 | 11.0 | 0.000000 | 2.0000000 | 50 | 1.0 | 1.49 | 1.0 |
| 347 | 6.0 | 6.679963 | 11000000 | 500,000,000 | 0.0 | 0 | 1.0 |
| 7698 | 14.0 | 0.698970 | 12000000 | 100 | 0.0 | 0 | 1.0 |
5196 rows × 7 columns
X_test
| Category | Reviews | Size | Installs | Type | Price | Content Rating | |
|---|---|---|---|---|---|---|---|
| 5708 | 29.0 | 2.845718 | 7.3000000 | 100,000 | 0.0 | 0 | 1.0 |
| 7215 | 11.0 | 2.559907 | 14000000 | 1,000 | 1.0 | 2.99 | 2.0 |
| 1350 | 15.0 | 4.852901 | 49000000 | 1,000,000 | 0.0 | 0 | 1.0 |
| 5727 | 29.0 | 4.895583 | 6.3000000 | 1,000,000 | 0.0 | 0 | 1.0 |
| 5326 | 11.0 | 3.997910 | 3.7000000 | 100,000 | 0.0 | 0 | 4.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 6351 | 29.0 | 3.066699 | 350000 | 100,000 | 0.0 | 0 | 1.0 |
| 1634 | 18.0 | 3.493319 | 13000000 | 100,000 | 0.0 | 0 | 1.0 |
| 9865 | 3.0 | 1.518514 | 37000000 | 1,000 | 0.0 | 0 | 1.0 |
| 10396 | 11.0 | 5.345748 | 94000000 | 5,000,000 | 0.0 | 0 | 4.0 |
| 6550 | 11.0 | 1.812913 | 24000000 | 10,000 | 0.0 | 0 | 1.0 |
2228 rows × 7 columns
Y_train
| Rating_category | |
|---|---|
| 8888 | 0.0 |
| 1298 | 0.0 |
| 6599 | 0.0 |
| 7328 | 0.0 |
| 5681 | 0.0 |
| ... | ... |
| 1522 | 0.0 |
| 7755 | 0.0 |
| 5978 | 1.0 |
| 347 | 0.0 |
| 7698 | 0.0 |
5196 rows × 1 columns
Y_test
| Rating_category | |
|---|---|
| 5708 | 0.0 |
| 7215 | 0.0 |
| 1350 | 0.0 |
| 5727 | 0.0 |
| 5326 | 0.0 |
| ... | ... |
| 6351 | 0.0 |
| 1634 | 0.0 |
| 9865 | 0.0 |
| 10396 | 0.0 |
| 6550 | 0.0 |
2228 rows × 1 columns
#Importing necessary libraries to Standardize the data.
from sklearn.preprocessing import StandardScaler
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 7424 entries, 0 to 10840 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Category 7424 non-null float64 1 Reviews 7424 non-null float64 2 Size 7424 non-null object 3 Installs 7424 non-null object 4 Type 7424 non-null float64 5 Price 7424 non-null object 6 Content Rating 7424 non-null float64 7 Rating_category 7424 non-null float64 dtypes: float64(5), object(3) memory usage: 522.0+ KB
Since the Size, Installs, and Price columns are of object dtype, they must be converted to numeric dtypes before standardization can be performed.
# Strip the formatting characters (thousands commas, trailing "+") and
# cast the remaining object-dtype columns to numeric types so that
# StandardScaler below can operate on the whole frame.
df['Installs'] = df['Installs'].apply(lambda x : x.replace(',','').replace('+','')).astype(int)
df['Size'] = df['Size'].apply(lambda x : x.replace(',','')).astype(float)
df['Price'] = df['Price'].apply(lambda x : x.replace(',','')).astype(float)
# Create the standard scaler (zero mean, unit variance per column).
scaler = StandardScaler()
# Standardize the data. fit_transform returns a NumPy array, so df is
# rebuilt as a DataFrame afterwards.
# NOTE(review): this scales the WHOLE frame including the target column
# Rating_category, which is usually unintended; it also runs after the
# train/test split above, so X_train/X_test remain unscaled. Consider
# fitting the scaler on X_train only and transforming X_train/X_test.
df = scaler.fit_transform(df)
df
array([[-2.03766618, -0.67995305, -0.07422317, ..., -0.06305842,
-0.46322046, -0.35358018],
[-2.03766618, -0.19653147, -0.27499084, ..., -0.06305842,
-0.46322046, -0.35358018],
[-2.03766618, 1.0099001 , -0.83713998, ..., -0.06305842,
-0.46322046, -0.35358018],
...,
[-0.68621673, -1.0632313 , 1.29099702, ..., -0.06305842,
-0.46322046, -0.35358018],
[-0.68621673, -1.66608206, -0.83714018, ..., -0.06305842,
-0.46322046, -0.35358018],
[ 0.17379656, 1.41571255, -0.07422317, ..., -0.06305842,
-0.46322046, -0.35358018]])
df = pd.DataFrame(df)
df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | |
|---|---|---|---|---|---|---|---|---|
| 0 | -2.037666 | -0.679953 | -0.074223 | -0.168764 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| 1 | -2.037666 | -0.196531 | -0.274991 | -0.158181 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| 2 | -2.037666 | 1.009900 | -0.837140 | -0.060990 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| 3 | -2.037666 | 1.251404 | 0.166698 | 0.910911 | -0.282029 | -0.063058 | 2.517665 | -0.35358 |
| 4 | -2.037666 | -0.196531 | -0.837140 | -0.166820 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7419 | -1.669089 | -1.023974 | -0.812285 | -0.168958 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| 7420 | -0.686217 | -1.516228 | -0.837140 | -0.168969 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| 7421 | -0.686217 | -1.063231 | 1.290997 | -0.168872 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| 7422 | -0.686217 | -1.666082 | -0.837140 | -0.168977 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| 7423 | 0.173797 | 1.415713 | -0.074223 | 0.046999 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
7424 rows × 8 columns
df.columns=['Category','Reviews','Size','Installs','Type','Price','Content Rating','Rating_Category']
df
| Category | Reviews | Size | Installs | Type | Price | Content Rating | Rating_Category | |
|---|---|---|---|---|---|---|---|---|
| 0 | -2.037666 | -0.679953 | -0.074223 | -0.168764 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| 1 | -2.037666 | -0.196531 | -0.274991 | -0.158181 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| 2 | -2.037666 | 1.009900 | -0.837140 | -0.060990 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| 3 | -2.037666 | 1.251404 | 0.166698 | 0.910911 | -0.282029 | -0.063058 | 2.517665 | -0.35358 |
| 4 | -2.037666 | -0.196531 | -0.837140 | -0.166820 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7419 | -1.669089 | -1.023974 | -0.812285 | -0.168958 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| 7420 | -0.686217 | -1.516228 | -0.837140 | -0.168969 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| 7421 | -0.686217 | -1.063231 | 1.290997 | -0.168872 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| 7422 | -0.686217 | -1.666082 | -0.837140 | -0.168977 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
| 7423 | 0.173797 | 1.415713 | -0.074223 | 0.046999 | -0.282029 | -0.063058 | -0.463220 | -0.35358 |
7424 rows × 8 columns
# Numeric / tabular libraries.
import numpy as np
import pandas as pd

# Plotting libraries for the honey-production analysis below.
import matplotlib.pyplot as plt
import seaborn as sns

# Per-state, per-year honey production figures (626 rows x 8 columns).
df1 = pd.read_csv("C:\\SZ\\GLCA DA PROJECTS\\New folder\\honeyproduction.csv")
df1  # preview the loaded frame
| state | numcol | yieldpercol | totalprod | stocks | priceperlb | prodvalue | year | |
|---|---|---|---|---|---|---|---|---|
| 0 | AL | 16000.0 | 71 | 1136000.0 | 159000.0 | 0.72 | 818000.0 | 1998 |
| 1 | AZ | 55000.0 | 60 | 3300000.0 | 1485000.0 | 0.64 | 2112000.0 | 1998 |
| 2 | AR | 53000.0 | 65 | 3445000.0 | 1688000.0 | 0.59 | 2033000.0 | 1998 |
| 3 | CA | 450000.0 | 83 | 37350000.0 | 12326000.0 | 0.62 | 23157000.0 | 1998 |
| 4 | CO | 27000.0 | 72 | 1944000.0 | 1594000.0 | 0.70 | 1361000.0 | 1998 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 621 | VA | 4000.0 | 41 | 164000.0 | 23000.0 | 3.77 | 618000.0 | 2012 |
| 622 | WA | 62000.0 | 41 | 2542000.0 | 1017000.0 | 2.38 | 6050000.0 | 2012 |
| 623 | WV | 6000.0 | 48 | 288000.0 | 95000.0 | 2.91 | 838000.0 | 2012 |
| 624 | WI | 60000.0 | 69 | 4140000.0 | 1863000.0 | 2.05 | 8487000.0 | 2012 |
| 625 | WY | 50000.0 | 51 | 2550000.0 | 459000.0 | 1.87 | 4769000.0 | 2012 |
626 rows × 8 columns
df1.head(5) # inspect the first five rows to sanity-check the load
| state | numcol | yieldpercol | totalprod | stocks | priceperlb | prodvalue | year | |
|---|---|---|---|---|---|---|---|---|
| 0 | AL | 16000.0 | 71 | 1136000.0 | 159000.0 | 0.72 | 818000.0 | 1998 |
| 1 | AZ | 55000.0 | 60 | 3300000.0 | 1485000.0 | 0.64 | 2112000.0 | 1998 |
| 2 | AR | 53000.0 | 65 | 3445000.0 | 1688000.0 | 0.59 | 2033000.0 | 1998 |
| 3 | CA | 450000.0 | 83 | 37350000.0 | 12326000.0 | 0.62 | 23157000.0 | 1998 |
| 4 | CO | 27000.0 | 72 | 1944000.0 | 1594000.0 | 0.70 | 1361000.0 | 1998 |
df1.shape # (rows, columns) — confirms 626 rows and 8 columns
(626, 8)
df1.info() # dtypes and non-null counts: every column is 626 non-null, so no missing values
<class 'pandas.core.frame.DataFrame'> RangeIndex: 626 entries, 0 to 625 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 state 626 non-null object 1 numcol 626 non-null float64 2 yieldpercol 626 non-null int64 3 totalprod 626 non-null float64 4 stocks 626 non-null float64 5 priceperlb 626 non-null float64 6 prodvalue 626 non-null float64 7 year 626 non-null int64 dtypes: float64(5), int64(2), object(1) memory usage: 39.2+ KB
The data set has only one object (string) column, 'state'; all other columns are numeric. There are 626 rows and 8 columns, i.e. shape (626, 8).
dist = df1.groupby('year').size()/len(df1)*100 #calculating distribution per year
dist
year 1998 6.869010 1999 6.869010 2000 6.869010 2001 7.028754 2002 7.028754 2003 7.028754 2004 6.549521 2005 6.549521 2006 6.549521 2007 6.549521 2008 6.549521 2009 6.389776 2010 6.389776 2011 6.389776 2012 6.389776 dtype: float64
plt.pie(dist, labels=dist.index,autopct='%1.1f%%')
plt.title("Distribution")
plt.show()
sns.displot(data=df1['priceperlb']) # histogram of price per pound across all state-year rows
<seaborn.axisgrid.FacetGrid at 0x174a082e1a0>
INFERENCE: The most frequently observed prices fall between $1.1 and $1.5 per pound, and the distribution suggests a gradual increase in price over time.
# Scatter plot: does production value track the number of colonies?
colony_counts = df1['numcol']
production_values = df1['prodvalue']
plt.scatter(colony_counts, production_values)
plt.xlabel('Number of Colonies')
plt.ylabel('Production Value')
plt.title('numcol vs prodval')
plt.show()
INFERENCE: As the number of colonies increases, production value also increases, indicating a positive correlation.
# BUG FIX: the original called plt.figure(figsize=(50,50)) AFTER
# sns.boxplot() and plt.show(), so the size was never applied — it only
# created a second, empty figure (hence the stray
# "<Figure size 5000x5000 with 0 Axes>" output). The figure must be
# created, with its size, BEFORE plotting. 50x50 inches is also far too
# large to render; a conventional wide-format size is used instead.
plt.figure(figsize=(12, 6))
# Box plot of production value per survey year.
sns.boxplot(x=df1['year'], y=df1['prodvalue'], linewidth=None)
plt.show()
<Figure size 5000x5000 with 0 Axes>
INFERENCE : Year 2010 has the highest 'prodvalue'.
sns.pairplot(df1, vars=['numcol', 'yieldpercol', 'totalprod', 'prodvalue', 'year'], hue='year') # pairwise scatter grid of the main numeric measures, coloured by survey year
<seaborn.axisgrid.PairGrid at 0x174a1b3a050>
INFERENCE: A roughly linear relationship is visible between 'prodvalue' and 'numcol', and between 'prodvalue' and 'totalprod'.
corr_matrix=df1[['numcol', 'yieldpercol', 'totalprod', 'stocks', 'priceperlb', 'prodvalue']].corr()
corr_matrix
| numcol | yieldpercol | totalprod | stocks | priceperlb | prodvalue | |
|---|---|---|---|---|---|---|
| numcol | 1.000000 | 0.243515 | 0.953594 | 0.825929 | -0.232701 | 0.912796 |
| yieldpercol | 0.243515 | 1.000000 | 0.396252 | 0.367812 | -0.358646 | 0.278977 |
| totalprod | 0.953594 | 0.396252 | 1.000000 | 0.878830 | -0.264499 | 0.907236 |
| stocks | 0.825929 | 0.367812 | 0.878830 | 1.000000 | -0.305867 | 0.728560 |
| priceperlb | -0.232701 | -0.358646 | -0.264499 | -0.305867 | 1.000000 | -0.089567 |
| prodvalue | 0.912796 | 0.278977 | 0.907236 | 0.728560 | -0.089567 | 1.000000 |
# Render the correlation matrix as an annotated heatmap; each cell
# shows the correlation coefficient for the corresponding column pair.
ax = sns.heatmap(corr_matrix, annot=True)
ax.set_title("Correlation Matrix")
plt.show()
INFERENCE: 'totalprod' and 'numcol' have the highest positive correlation (0.95). The strongest negative correlation is between 'yieldpercol' and 'priceperlb' (-0.36).